Studies
# Total number of rows in `studies`, reported below as the trial (NCT_ID)
# count before any filtering.
n_studies_total <- nrow(studies)
sprintf("Total trials (NCT_IDs): %d", n_studies_total)
## [1] "Total trials (NCT_IDs): 300214"
# Keep interventional trials only. `%in%` treats a missing study_type as
# "not interventional"; subsetting a data.frame with `==` would instead emit
# an all-NA row for every NA and inflate the count reported below.
studies <- studies[studies$study_type %in% "Interventional", ]
# The column is constant after the filter, so it is dropped.
studies$study_type <- NULL
n_studies_itv <- nrow(studies)
sprintf(
  "Interventional trials: %d (%.1f%%)",
  n_studies_itv, 100 * n_studies_itv / n_studies_total
)
## [1] "Interventional trials: 237892 (79.2%)"
# Recode the literal "N/A" phase label as a true missing value so that
# table(..., useNA = "ifany") reports it in the NA bucket.
studies$phase[which(studies$phase == "N/A")] <- NA
"===All studies, phase:"
## [1] "===All studies, phase:"
# Trial counts per phase, rendered as a captioned table.
tbl <- table(studies[["phase"]], useNA = "ifany")
#sprintf("%18s: %6d", names(tbl), tbl)
knitr::kable(x = data.frame(tbl), caption = "All studies, by phase")
Table: All studies, by phase

| Var1            |  Freq |
|:----------------|------:|
| Early Phase 1   |  2619 |
| Phase 1         | 29795 |
| Phase 1/Phase 2 | 10063 |
| Phase 2         | 41637 |
| Phase 2/Phase 3 |  4963 |
| Phase 3         | 29662 |
| Phase 4         | 25001 |
| NA              | 94152 |

### Drugs (id, nct_id, name)

“id” is AACT_ID

# Attach drug interventions to the studies table. The full outer join keeps
# studies without any drug (drug columns NA) and also drug rows whose nct_id
# is not present in `studies`.
# NOTE(review): all = TRUE re-introduces trials that are absent from the
# filtered `studies` table whenever they have a drug row -- confirm that is
# intended rather than all.x = TRUE.
studies <- merge(
  studies,
  dplyr::rename(drugs, drug_name = name, drug_itv_id = id),
  by = "nct_id", all = TRUE
)
studies[["is_drug_trial"]] <- !is.na(studies$drug_itv_id)
# Annotate every drug with the attributes of its trial. `studies` now holds
# one row per (trial, drug), so joining on nct_id alone pairs each drug with
# every drug of the same trial (k^2 rows for a trial with k drugs). Keep only
# the self-pairing, plus any unmatched left-join rows, so each intervention is
# counted once. All columns (including drug_itv_id / drug_name) are retained.
drugs <- merge(drugs, studies, by = "nct_id", all.x = TRUE, all.y = FALSE)
is_own_row <- is.na(drugs$drug_itv_id) | drugs$id == drugs$drug_itv_id
drugs <- drugs[which(is_own_row), ]
drugs <- drugs[order(drugs$name), ]
# NOTE(review): drug-level row counts echoed further down in this rendered
# report were produced before this de-duplication; re-knit to refresh them.
"===All drugs, phase:"
## [1] "===All drugs, phase:"
# Drug rows per trial phase; a missing phase gets its own NA bucket.
tbl <- table(drugs[["phase"]], useNA = "ifany")
sprintf(fmt = "%18s: %6d", names(tbl), tbl)
## [1] " Early Phase 1: 5947" " Phase 1: 154025"
## [3] " Phase 1/Phase 2: 44284" " Phase 2: 212320"
## [5] " Phase 2/Phase 3: 17815" " Phase 3: 161245"
## [7] " Phase 4: 92565" " NA: 73388"
# Number of distinct trials that have at least one drug row.
sprintf("Drug trials (NCT_IDs): %d", sum(!duplicated(drugs$nct_id)))
## [1] "Drug trials (NCT_IDs): 129628"
# Number of distinct drug name strings.
sprintf("Unique drug names: %d", sum(!duplicated(drugs$name)))
## [1] "Unique drug names: 91347"
NextMove Leadmine NER
# Align the Leadmine output with the drug table: DocName becomes the `id`
# join key and ResolvedForm is exposed as `smiles`.
drugs_leadmine <- dplyr::rename(
  drugs_leadmine,
  id = DocName,
  smiles = ResolvedForm
)
# Inner join on id: drug rows without any Leadmine row are dropped.
drugs <- merge(x = drugs, y = drugs_leadmine, by = "id")
# Flag rows whose mention resolved to a structure (non-missing smiles).
drugs$resolved_structure <- !is.na(drugs[["smiles"]])
"===Drugs, resolved structure:"
## [1] "===Drugs, resolved structure:"
# Row counts by whether a structure was resolved.
tbl <- table(drugs[["resolved_structure"]])
sprintf(fmt = "%18s: %6d", names(tbl), tbl)
## [1] " FALSE: 55047" " TRUE: 543513"
"===Drugs, overall_status:"
## [1] "===Drugs, overall_status:"
# Row counts by the trial's overall recruitment status.
tbl <- table(drugs$overall_status)
sprintf(fmt = "%18s: %6d", names(tbl), tbl)
## [1] "Active, not recruiting: 36946" " Completed: 337892"
## [3] "Enrolling by invitation: 1859" "Not yet recruiting: 16986"
## [5] " Recruiting: 79304" " Suspended: 2787"
## [7] " Terminated: 52527" " Unknown status: 41516"
## [9] " Withdrawn: 15141"
Studies by year
## Warning: Ignoring 1 observations
### Studies by classification
### Aggregate mentions by intervention ID.
# Resolved Leadmine mentions per intervention id; the number of groups is
# compared with the number of distinct intervention ids in `drugs`.
ner <- drugs_leadmine %>%
  dplyr::filter(!is.na(smiles)) %>%
  group_by(id) %>%
  summarise(n = n())
sprintf(
  "Mentions by intervention ID: %.1f%% (%d/%d)",
  100 * nrow(ner) / length(unique(drugs$id)),
  nrow(ner),
  length(unique(drugs$id))
)
## [1] "Mentions by intervention ID: 91.9% (157862/171741)"
### Aggregate mentions by trial.
# Map every Leadmine mention to its trial. The (intervention, trial) lookup
# is de-duplicated first: `drugs` can repeat the same pair on many rows, and
# merging against those repeats would multiply every mention row. The number
# of distinct trials (the figure reported below) is unaffected.
drugs_leadmine <- merge(
  drugs_leadmine,
  unique(drugs[, c("drug_itv_id", "nct_id")]),
  by.x = "id", by.y = "drug_itv_id"
)
# Resolved mentions per trial.
ner <- drugs_leadmine[!is.na(drugs_leadmine$smiles), ] %>%
  group_by(nct_id) %>%
  summarise(n = n())
sprintf(
  "Mentions by study: %.1f%% (%d/%d)",
  100 * nrow(ner) / length(unique(drugs$nct_id)),
  nrow(ner), length(unique(drugs$nct_id))
)
## [1] "Mentions by study: 93.3% (92966/99647)"
### Aggregate mentions by drug.
# Resolved Leadmine mentions per matched text; the number of distinct matched
# strings is compared with the number of distinct drug names in `drugs`.
ner <- drugs_leadmine %>%
  dplyr::filter(!is.na(smiles)) %>%
  group_by(OriginalText) %>%
  summarise(n = n())
sprintf(
  "Mentions by drug name: %.1f%% (%d/%d)",
  100 * nrow(ner) / length(unique(drugs$name)),
  nrow(ner),
  length(unique(drugs$name))
)
## [1] "Mentions by drug name: 19.1% (11108/58297)"
PUBCHEM:
Intervention IDs to CIDs from PubChem (via SMILES)
# Keep only rows with a usable CID: drop missing values and the value 0
# (presumably a "no match" marker -- confirm against the PubChem export).
drug2cid <- drug2cid[!is.na(drug2cid$cid) & drug2cid$cid != 0, ]
# Attach intervention ids through the shared SMILES string (inner join).
drug2cid <- merge(
  drug2cid,
  unique(drugs[, c("smiles", "id")]),
  by = "smiles", all.x = FALSE, all.y = FALSE
)
drug2cid <- dplyr::rename(drug2cid, itv_id = "id")
# Only the itv_id <-> cid pairing is exported; drop the helper columns and
# collapse duplicate pairs.
drug2cid$smiles <- NULL
drug2cid$names <- NULL
drug2cid <- unique(drug2cid)
sprintf(
  "Intervention IDs mapped to PubChem CIDs (via SMILES): %d",
  nrow(drug2cid)
)
## [1] "Intervention IDs mapped to PubChem CIDs (via SMILES): 153876"
write_delim(drug2cid, "../data/aact_drugs_itvid2cid.tsv", delim = "\t")
InChIKeys from PubChem (via CIDs)
# Row count of the PubChem CID -> InChIKey table.
sprintf(fmt = "PubChem CIDs with InChIKeys: %d", nrow(pubchem))
## [1] "PubChem CIDs with InChIKeys: 3801"
CHEMBL:
ChEMBL molecule IDs, and properties (via InChIKeys)
ChEMBL activities (via compounds)
ChEMBL target IDs (via activities)
IDG/TCRD:
# Load the TCRD/Pharos target table. The column specification echoed by the
# default call shows grantCount, r01Count, grantTotalCost and pubmedCount
# guessed as logical, which typically means the rows sampled for type guessing
# held no values in those columns. Widen the guessing sample so column types
# are inferred from (effectively) the whole file.
# NOTE(review): the path is specific to one user's home directory.
tcrd_tgt <- read_delim(
  "~/src/TCRD_tools/data/pharos_targets.tsv",
  delim = "\t",
  guess_max = 100000
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## idgFamily = col_character(),
## accession = col_character(),
## self = col_character(),
## grantCount = col_logical(),
## description = col_character(),
## kind = col_character(),
## name = col_character(),
## r01Count = col_logical(),
## deprecated = col_logical(),
## grantTotalCost = col_logical(),
## idgTDL = col_character(),
## pubmedCount = col_logical(),
## gene = col_character()
## )
## See spec(...) for full column specifications.
# Left join: keep every ChEMBL target and add TCRD annotations where the
# accession matches.
tgt <- merge(
  chembl_tgt, tcrd_tgt,
  by = "accession", all.x = TRUE, all.y = FALSE
)
# A target counts as mapped when the join supplied a TDL value; count the
# matches directly instead of materialising the subset.
sprintf(
  "ChEMBL target proteins mapped to TCRD (human): %d",
  sum(!is.na(tgt$idgTDL))
)
## [1] "ChEMBL target proteins mapped to TCRD (human): 1806"
# Convert in place to a data.table for the grouped counts below.
setDT(tgt)
sprintf("Organisms: %d", length(unique(tgt$organism)))
## [1] "Organisms: 187"
"===Targets by organism (top 10):"
## [1] "===Targets by organism (top 10):"
# Targets per organism, largest first. head() returns at most 10 rows, whereas
# indexing with 1:10 would pad the result with NA rows when fewer than 10
# organisms are present.
org_counts <- head(tgt[, .(.N), by = "organism"][order(-N)], 10)
sprintf("%28s: %6d", org_counts$organism, org_counts$N)
## [1] " Homo sapiens: 1806"
## [2] " Rattus norvegicus: 529"
## [3] " Mus musculus: 238"
## [4] " Bos taurus: 98"
## [5] " Sus scrofa: 36"
## [6] " Cavia porcellus: 26"
## [7] " Escherichia coli K-12: 19"
## [8] " Oryctolagus cuniculus: 18"
## [9] " Escherichia coli: 17"
## [10] " Mycobacterium tuberculosis: 17"
"===Targets, TDL for human:"
## [1] "===Targets, TDL for human:"
# Human targets per IDG target development level (TDL).
tdl_counts <- tgt[organism == "Homo sapiens", .N, by = idgTDL]
sprintf(fmt = "%8s: %6d", tdl_counts$idgTDL, tdl_counts$N)
## [1] " Tbio: 224" " Tchem: 868" " Tdark: 7"
## [4] " Tclin: 707"